String similarity (bag of words)

Now that we've implemented the naive 'bag of functions' method, we attempt a bag of strings. A string is defined as a contiguous run of alphanumeric characters, longer than 2 characters, that appears in the cell's code.


In [2]:
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary

In [3]:
# Load every .ipynb under each person's directory in the testbed.
base_dir = '../testbed/Final'
people = os.listdir(base_dir)
notebooks = []
for name in people:
    person_dir = os.path.join(base_dir, name)
    if not os.path.isdir(person_dir):
        continue
    notebooks.extend(os.path.join(person_dir, fname)
                     for fname in os.listdir(person_dir)
                     if fname.endswith('.ipynb'))
# Only the first 5 notebooks are mined here (keeps the demo fast).
notebook_objs = [NotebookMiner(path) for path in notebooks[:5]]
a = ASTFeatures(notebook_objs)

In [3]:
# Compute the pairwise jaccard similarity between cells, timing the call.
start = time.time()
rd, cls = a.ranked_cell_similarity()
elapsed = time.time() - start
print ('Time elapsed: ', elapsed)


0 / 272
Time elapsed:  1.3567440509796143

In [4]:
# Pull each similarity metric out of the ranked dict into its own list.
short_similarities = [rd[key]['short_dict_similarity'] for key in rd]
full_similarities = [rd[key]['full_dict_similarity'] for key in rd]
call_similarities = [rd[key]['call_dict_similarity'] for key in rd]
string_similarities = [rd[key]['string_dict_similarity'] for key in rd]

In [5]:
import numpy as np
# Convert the collected similarity lists to numpy arrays for vectorized stats.
short_similarities, full_similarities, call_similarities, string_similarities = (
    np.array(vals)
    for vals in (short_similarities, full_similarities,
                 call_similarities, string_similarities)
)

In [6]:
# Mean similarity for each bag representation, in the same order as above.
for scores in (short_similarities, full_similarities,
               call_similarities, string_similarities):
    print (np.mean(scores))


0.00977273713666
0.00222865809354
0.00219376504075
0.100524346491

In [7]:
# Fraction of cell pairs with nonzero string similarity.
# Fix: the original counted nonzeros from string_similarities but took the
# total from call_similarities; count and total now use the SAME array so
# the reported fraction is self-consistent.
greater_than_0 = int(np.count_nonzero(string_similarities > 0))
total_length = len(string_similarities)
print ('Total length: ', total_length)
print ('Greater than 0: ', greater_than_0)
print ('Fraction greater than 0: ',greater_than_0/total_length)


Total length:  28659
Greater than 0:  24247
Fraction greater than 0:  0.8460518510764506

We now have a much higher fraction of cell pairs with nonzero similarity than under the bag-of-functions method.


In [8]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [9]:
plt.rcParams['figure.figsize'] = (20, 10)

# Histogram of the nonzero string similarities (zeros dominate, so drop them).
nonzero_scores = [score for score in string_similarities if score != 0]
plt.hist(nonzero_scores, bins=50)
plt.show()



In [11]:
# Finding examples of cells that have a high string similarity
# Print up to 10 pairs of cell sources whose string similarity exceeds 0.3,
# skipping pairs whose first cell has fewer than 20 string tokens.
total_examples = 10
for key in rd:
    if total_examples == 0:
        break
    if rd[key]['string_similarity'] <= .3:
        continue
    if len(cls[int(key[0])].get_feature('strings')) < 20:
        continue
    print ("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print ("CODE X:")
    for line in cls[int(key[0])].get_feature('code').split('\n'):
        print (line)
    print ("\n\nCODE Y:")
    for line in cls[int(key[1])].get_feature('code').split('\n'):
        print (line)
    print ("\n\n\n\n\n\n")
    total_examples -= 1


LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'
import warnings
warnings.filterwarnings('ignore')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import word_tokenize

from string import punctuation
from nltk.corpus import stopwords
import nltk

import json
import seaborn as sns
sns.set_context('notebook')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'
import warnings
warnings.filterwarnings('ignore')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import word_tokenize

from string import punctuation
from nltk.corpus import stopwords
import nltk

import json
import seaborn as sns
sns.set_context('notebook')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'
import warnings
warnings.filterwarnings('ignore')




CODE Y:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

tweetId = [item['id'] for item in t1]
user_id = [item['user']['id'] for item in t1]
favorite = [item['favorite_count'] for item in t1]
date = [item['created_at'] for item in t1]
retweet_count = [item['retweet_count'] for item in t1]
place = [item['place'] for item in t1]
id_str = [item['id_str'] for item in t1]
# user = [item['retweeted_status'] for item in t1]
epfl_df = pd.DataFrame({'tweetId': tweetId , 'id_str':id_str,'user_id': user_id,'favorite': favorite,'date': date,'retweet_count': retweet_count,'place':place})




CODE Y:

# coding: utf-8

# In[ ]:

epfl_en['user_id'] = pd.DataFrame([user['id_str'] for user in epfl_en.user])
epfl_en['user_id'].head()









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

tweetId = [item['id'] for item in t2]
user_id = [item['user']['id'] for item in t2]
favorite = [item['favorite_count'] for item in t2]
date = [item['created_at'] for item in t2]
retweet_count = [item['retweet_count'] for item in t2]
place = [item['place'] for item in t2]
id_str = [item['id_str'] for item in t2]
# user = [item['retweeted_status'] for item in t2]
eth_df = pd.DataFrame({'tweetId': tweetId , 'id_str':id_str,'user_id': user_id,'favorite': favorite,'date': date,'retweet_count': retweet_count,'place':place})




CODE Y:

# coding: utf-8

# In[ ]:

epfl_en['user_id'] = pd.DataFrame([user['id_str'] for user in epfl_en.user])
epfl_en['user_id'].head()









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

columns_with_nan = pd.isnull(small_epfl).sum() > 0
columns_with_nan = list(columns_with_nan[columns_with_nan].index)
print(columns_with_nan)




CODE Y:

# coding: utf-8

# In[ ]:

for index, raw in df_big_with_hashtags.iterrows():
    for hashtag in raw['hashtag_list']:
        print(hashtag)
    break









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

columns_with_nan = pd.isnull(small_eth).sum() > 0
columns_with_nan = list(columns_with_nan[columns_with_nan].index)
print(columns_with_nan)




CODE Y:

# coding: utf-8

# In[ ]:

for index, raw in df_big_with_hashtags.iterrows():
    for hashtag in raw['hashtag_list']:
        print(hashtag)
    break









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import word_tokenize

from string import punctuation
from nltk.corpus import stopwords
import nltk

import json
import seaborn as sns
sns.set_context('notebook')




CODE Y:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt










In [12]:
# Finding examples of cells that have a high string similarity
# Print up to 50 pairs of string-token lists whose string similarity exceeds
# 0.5, skipping pairs whose first cell has fewer than 10 string tokens.
total_examples = 50
for key in rd:
    if total_examples == 0:
        break
    if rd[key]['string_similarity'] <= .5:
        continue
    if len(cls[int(key[0])].get_feature('strings')) < 10:
        continue
    print ("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print ("CODE X:")
    for line in cls[int(key[0])].get_feature('strings'):
        print (line)
    print ("\n\nCODE Y:")
    for line in cls[int(key[1])].get_feature('strings'):
        print (line)
    print ("\n\n\n\n\n\n")
    total_examples -= 1


LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:
coding
utf
read
json
epfl
json
typ
dataframe
read
json
eth
json
typ
dataframe


CODE Y:
coding
utf
epfl
read
json
epfl
json
epfl
head







LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:
coding
utf
read
json
epfl
json
typ
dataframe
read
json
eth
json
typ
dataframe


CODE Y:
coding
utf
eth
read
json
eth
json
eth
head







LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:
coding
utf
read
json
epfl
json
typ
dataframe
read
json
eth
json
typ
dataframe


CODE Y:
coding
utf
read
json
epfl
json
read
json
eth
json
print
EPFL
head







LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:
coding
utf
read
json
epfl
json
typ
dataframe
read
json
eth
json
typ
dataframe


CODE Y:
coding
utf
EPFL
read
json
epfl
json
ETH
read
json
eth
json







LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:
coding
utf
lda
gensim
models
ldamodel
LdaModel
corpus
word
dictionary
num
topics
num
topics


CODE Y:
coding
utf
lda
ldamodel
LdaModel
corpus
word
dictionary
num
topics
corpus
lda
lda
corpus
for
range
print
lda
print
topic







LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:
coding
utf
lda
gensim
models
ldamodel
LdaModel
corpus
word
dictionary
num
topics
num
topics


CODE Y:
coding
utf
lda
ldamodel
LdaModel
corpus
word
dictionary
num
topics
corpus
lda
lda
corpus
for
range
print
lda
print
topic







LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:
coding
utf
num
topics
num
top
words
lda
gensim
models
ldamodel
LdaModel
corpus
word
dictionary
num
topics
num
topics


CODE Y:
coding
utf
lda
ldamodel
LdaModel
corpus
word
dictionary
num
topics
corpus
lda
lda
corpus
for
range
print
lda
print
topic







LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:
coding
utf
num
topics
num
top
words
lda
gensim
models
ldamodel
LdaModel
corpus
word
dictionary
num
topics
num
topics


CODE Y:
coding
utf
lda
ldamodel
LdaModel
corpus
word
dictionary
num
topics
corpus
lda
lda
corpus
for
range
print
lda
print
topic







LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:
coding
utf
read
json
epfl
json
read
json
eth
json
print
EPFL
head


CODE Y:
coding
utf
EPFL
read
json
epfl
json
ETH
read
json
eth
json








In [ ]: